---
title: "The scope of software patent protection"
author: "Yu-Kai Lin, Arun Rai"
date: "`r Sys.Date()`"
output: html_document
---
  
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```

## Prerequisites

To replicate the analysis in this paper, it is important to ensure that all needed data files and R packages are available on the test machine. 

### Data

* **sw_patents.csv** (available from Dataverse): This file contains a list of software patents granted between 1976 and 2018. They were identifed as software patents using keywords (BH2007), international patent classification (GM2003), or US patent classification (CAHP2019).

* **patents.csv** (available from Dataverse): This file contains a list of patent numbers and grant date for all the patents granted between 1976 and 2018. 

* **qtr_data_without_comp.csv** (available from Dataverse): This file contains quarterly firm-level observations from 2010 to 2018 regarding patenting outcomes and GitHub commits. 

* **comp_fundq.csv** (proprietary data; downloadable from Wharton Research Data Services): This file contains public firms' quarterly financial data from 2010 to 2018 (Compustat Daily Updates - Fundamentals Quarterly). At the minimum, the file should include the following variables: `gvkey`, `datadate`, `datacqtr`, `atq`, `chq`, `cshoq`, `dlcq`, `dlttq`, `pstkq`, `prccq`, `saleq`, `sic`. 

* **comp_funda.csv** (proprietary data; downloadable from Wharton Research Data Services): This file contains public firms' annual financial data from 2010 to 2018 (Compustat Daily Updates - Fundamentals Annual). At the minimum, the file should include the following variables: `gvkey`, `datadate`, `fyear`, `indfmt`, `emp`, `xrd`. 


### R Packages

```{r}
if(!require(pacman)) install.packages("pacman")

pacman::p_load("imputeTS", "grid", "ggthemes", "table1",
               "lfe", "sjPlot", "lubridate", "tidyverse")
#devtools::install_github("strengejacke/strengejacke")
```


## Figures

The function below provides the ggplot theme that we used in our data visualization (credit: [Koundinya Desiraju](https://rpubs.com/Koundy/71792))
```{r}
theme_Publication <- function(base_size=14) {
  library(grid)
  library(ggthemes)
  (theme_foundation(base_size=base_size) + 
      theme(
        panel.background = element_rect(colour = NA),
        plot.background = element_rect(colour = NA),
        panel.border = element_rect(colour = NA),
        axis.title = element_text(face = "bold",size = rel(1)),
        axis.title.y = element_text(angle=90,vjust =2),
        axis.title.x = element_text(vjust = -0.2),
        axis.text = element_text(), 
        axis.line = element_line(colour="black"),
        axis.ticks = element_line(),
        panel.grid.major = element_line(colour="#f0f0f0"),
        panel.grid.minor = element_blank(),
        legend.key = element_rect(colour = NA),
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.key.size= unit(0.7, "cm"),
        legend.spacing = unit(0.5, "cm"),
        legend.title = element_text(face="italic"),
        plot.margin=unit(c(10,5,5,5),"mm"),
        strip.background=element_rect(colour="#f0f0f0",fill="#f0f0f0"),
        strip.text = element_text(face="bold")
      ))
}
```



### Fig 1

```{r}
qtr_data <- read_csv("qtr_data_without_comp.csv") %>% 
  mutate(group = factor(group, levels = c("SW", "IT-NON-SW", "NON-IT"))) 

fig1_df = qtr_data %>% 
  group_by(group, period_yyyyq) %>% 
  summarise(
    avg_n_patents=mean(n_patents)
  ) %>% 
  ungroup() %>% 
  arrange(group, period_yyyyq) %>% 
  mutate(before_after = ifelse(
    period_yyyyq >= 20143, "After Alice", "Before Alice")) %>% 
  mutate(group2 = str_c(group, " (", before_after, ")")) %>% 
  mutate(
    group2 = factor(group2, levels = 
                      c("SW (Before Alice)", "SW (After Alice)", 
                        "IT-NON-SW (Before Alice)", "IT-NON-SW (After Alice)",
                        "NON-IT (Before Alice)", "NON-IT (After Alice)"))
  ) %>% 
  mutate(period_yyyyq=factor(period_yyyyq))

ggplot(
  fig1_df, aes(x=period_yyyyq, y=avg_n_patents, 
               group=group2, color=group2, shape=group2)) +
  geom_point(size=3)+
  geom_vline(xintercept=which(levels(fig1_df$period_yyyyq) == "20142")+0.3, 
             linetype='solid', color="#e67e22", size=1) + 
  geom_vline(xintercept=which(levels(fig1_df$period_yyyyq) == "20144")+0.3, 
             linetype='solid', color="#e67e22", size=1) + 
  geom_text(
    aes(x=which(levels(fig1_df$period_yyyyq) == "20142")-0.4, 
        y=6.4, label="Ruling on Alice", family="Arial"), 
    colour="#e67e22", angle=90, size=4) +
  geom_text(
    aes(x=which(levels(fig1_df$period_yyyyq) == "20144")-0.4, 
        y=7, label="IEG from USPTO", family="Arial"), 
    colour="#e67e22", angle=90, size=4) +
  scale_color_manual(
    values = c("#A5CADB", "#1565c0", "#e57373", "#c62828", "#b2babb", "#707b7c")) +
  scale_shape_manual(values = c(16,16,17,17,15,15)) +
  scale_x_discrete(breaks=paste0(rep(c(2010:2018),each=4), c(1,3)), 
                   labels=paste0(rep(c(2010:2018),each=4), c("Q1", "Q3")))+
  labs(y="Average number of\nnew patents per firm", x="Time", 
       color='Group', shape='Group') +
  theme_Publication() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        axis.title.x=element_blank(),
        plot.margin = margin(0.2, 0.2, 0.2, 0.2, "cm"))
```

### Fig 2

```{r}
qtr_data <- read_csv("qtr_data_without_comp.csv") %>% 
  mutate(group = factor(group, levels = c("SW", "IT-NON-SW", "NON-IT"))) 

comp_fundq <- read_csv("comp_fundq.csv")
comp_funda <- read_csv("comp_funda.csv") %>% 
  filter(indfmt == "INDL")

comp = comp_fundq %>% 
  left_join(comp_funda, by=c("gvkey", "datadate")) %>% 
  mutate(
    gvkey = as.numeric(gvkey),
    period_yyyyq = as.numeric(str_replace(datacqtr, "Q","")),
    # Bharadwaj et al. (1999)
    tobins_q = (cshoq * prccq + pstkq + dlttq + dlcq)/atq 
  ) %>% 
  arrange(gvkey, period_yyyyq) %>% 
  filter(!is.na(period_yyyyq)) %>% 
  mutate(
    # Bloom et al. (2013):
    # Tobin’s Q was winsorized by setting it to 0.1 for values below 0.1
    # and at 20 for values above 20 (see also Lanjouw and Schankerman (2004)).
    tobins_q = case_when(
      tobins_q < .1 ~ .1,
      tobins_q > 20 ~ 20,
      T             ~ tobins_q
    )
  ) %>% 
  mutate(
    # Left-truncate the following variable at 0 
    saleq=ifelse(saleq<0,0,saleq)
  )

fig2_df = qtr_data %>% 
  left_join(comp, by = c("gvkey", "period_yyyyq"))  %>% 
  mutate(
    log_tobins_q = log1p(tobins_q),
    log_saleq = log1p(saleq),
    log_github_n_commits = log1p(github_n_commits),
    log_avg_n_claims_per_patent = log1p(avg_n_claims_per_patent)
  ) %>% 
  mutate(before_after = ifelse(period_yyyyq<=20142, "Before Alice", "After Alice")) %>% 
  mutate(before_after = factor(before_after, levels = c("Before Alice", "After Alice"))) %>% 
  rename(Group = group) %>% 
  group_by(Group, before_after) %>% 
  summarise(
    `Avg Tobin's Q (log)` = mean(log_tobins_q, na.rm=T),
    `Avg Sales (log)` = mean(log_saleq, na.rm=T),
    `Avg GitHub Commits (log)` = mean(log_github_n_commits, na.rm=T),
    `Avg Patent Scope (log)` = mean(log_avg_n_claims_per_patent, na.rm=T)) %>% 
  gather(key="outcome", value="Value", 
         starts_with("Avg ")) %>% 
  mutate(outcome=factor(
    outcome, levels = c("Avg Tobin's Q (log)", "Avg Sales (log)", 
                        "Avg GitHub Commits (log)", "Avg Patent Scope (log)")))
  
fig2_df %>% 
  ggplot(aes(x = before_after, y = Value, 
             color = Group, shape=Group,
             inetype = Group, group=Group)) + 
  geom_point(size = 3) + 
  geom_line(size=1) + 
  facet_wrap(~outcome, nrow=2, scales = "free") +
  labs(x="",y="")+
  scale_color_manual(values = c("#1565c0", "#c62828", "#707b7c")) +
  theme_Publication(base_size=14)+
  theme(axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        plot.margin = margin(0.2, 0.2, 0.2, 0.2, "cm"),
        legend.key.width=unit(1, "cm"))

```


### Fig S1

In Figure S1, we show that the number of utility patent grants from USPTO every year from 1976 to 2018. We see that the porportion of software patents had been steadly increasing since mid-1990s. The growth of software patenting was slower after the Supreme Court's ruling on Alice. 

```{r cache=T}
sw_patents = read_csv("sw_patents.csv")
patents = read_csv("patents.csv")

figs1_df = patents %>% 
  mutate(sw_patent = patent_id %in% sw_patents$patent_id, 
         year = year(date)) %>% 
  filter(year<=2018) %>% 
  group_by(year) %>% 
  summarise(n_patents = n(),
            n_sw_patents = sum(sw_patent),
            n_non_sw_patents = n_patents-n_sw_patents,
            prop_sw_patents = n_sw_patents/n_patents) %>% 
  arrange(year)

brks_x <- seq(1976, 2018, by=6)
lbls_x <- seq(1976, 2018, by=6)
brks_y <- seq(0, 300000, by=50000)
lbls_y <- str_c(brks_y/1000, "k")
lbls_y[1] <- "0"

ggplot(figs1_df, aes(x=year)) + 
  geom_area(aes(y=n_sw_patents+n_non_sw_patents, fill="#005E7F")) + 
  geom_area(aes(y=n_sw_patents, fill="#CBE4F2")) + 
  geom_vline(xintercept=2014, linetype='solid', color="white", size=1) + 
  geom_text(
    aes(x=2013, y=127000, label="Ruling on Alice", family="Arial"),
    colour="white", angle=90, size=6) +
  labs(x="Time",y="Number of utility patent grants\nfrom USPTO every year") + 
  scale_x_continuous(labels = lbls_x, breaks = brks_x) +
  scale_y_continuous(labels = lbls_y, breaks = brks_y) +
  scale_fill_manual(
    name = "", values = c("#005E7F", "#F58229", "#CBE4F2"), 
    labels = c("Non-Software Patents", "Software Patents")) +
  theme(
    plot.margin = margin(0.2, 0.2, 0.2, 0.2, "cm"), 
    axis.title.x = element_text(size = rel(1.8)))+
  theme_Publication(base_size=14) 
```


## Regression Analysis

Before we can fit regression models, we first need to join data from `qtr_data.csv` and Compustat files. We assume that the data from Fundamentals Quarterly and Fundamentals Annual is stored in `comp_fundq.csv` and `comp_funda.csv`, respectively.

```{r catch=T}
qtr_data <- read_csv("qtr_data_without_comp.csv") %>% 
  mutate(group = factor(group, levels = c("SW", "IT-NON-SW", "NON-IT")))
comp_fundq <- read_csv("comp_fundq.csv")
comp_funda <- read_csv("comp_funda.csv") %>% 
  filter(indfmt == "INDL")

comp = comp_fundq %>% 
  left_join(comp_funda, by=c("gvkey", "datadate")) %>% 
  mutate(
    gvkey = as.numeric(gvkey),
    period_yyyyq = as.numeric(str_replace(datacqtr, "Q","")),
    # Bharadwaj et al. (1999; Management Science)
    tobins_q = (cshoq * prccq + pstkq + dlttq + dlcq)/atq 
  ) %>% 
  arrange(gvkey, period_yyyyq) %>% 
  filter(!is.na(period_yyyyq)) %>% 
  group_by(gvkey) %>% 
  filter(sum(!is.na(emp))>=2, 
         sum(!is.na(xrd))>=2) %>%
  mutate(emp = imputeTS::na_interpolation(emp, option ="linear"),
         xrd = imputeTS::na_interpolation(xrd, option ="linear")) %>%
  ungroup() %>% 
  mutate(
    # Bloom et al. (2013; Econometrica):
    # Tobin’s Q was winsorized by setting it to 0.1 for values below 0.1
    # and at 20 for values above 20 (see also Lanjouw and Schankerman (2004)).
    tobins_q = case_when(
      tobins_q < .1 ~ .1,
      tobins_q > 20 ~ 20,
      T             ~ tobins_q
    )
  ) %>% 
  mutate(
    # Left-truncate the following variable at 0 
    emp = ifelse(emp < 0, 0, emp),
    xrd = ifelse(xrd < 0 , 0, xrd),
    saleq = ifelse(saleq < 0, 0, saleq),
    atq = ifelse(atq < 0, 0, atq),
    chq = ifelse(chq < 0, 0, chq)
  )

market_share <- comp %>% 
  group_by(sic, period_yyyyq) %>% 
  summarise(sic_saleq = sum(saleq, na.rm=T))

comp <- comp %>% 
  left_join(market_share, by = c("period_yyyyq", "sic")) %>% 
  mutate(market_share = ifelse(sic_saleq==0, 0, saleq/sic_saleq)) %>% 
  select(gvkey, period_yyyyq, atq, chq, xrd, emp, 
         saleq, tobins_q, market_share)

df = qtr_data %>% 
  mutate(before_after = ifelse(
    normalized_period>0, "After Alice", "Before Alice")) %>% 
  mutate(before_after = factor(
    before_after, levels = c("Before Alice", "After Alice"))) %>% 
  left_join(comp, by = c("gvkey", "period_yyyyq")) 
```

### Table S2: Data Summary

```{r}
table1(~ patent_stock + n_patents + 
         avg_n_claims_per_patent +
         atq + chq + xrd + emp + market_share +
         tobins_q + saleq + github_n_commits | group, 
       data=df)
```


### Table S3: Main Results

```{r catch=T}
df = df %>% 
  mutate(
    log_tobins_q = log1p(tobins_q),
    log_saleq = log1p(saleq),
    log_github_n_commits = log1p(github_n_commits),
    log_avg_n_claims_per_patent = log1p(avg_n_claims_per_patent),
    log_patent_stock = log1p(patent_stock),
    log_xrd = log1p(xrd),
    log_emp = log1p(emp),
    log_atq = log1p(atq),
    log_chq = log1p(chq),
    log_market_share = log1p(market_share),
    log_age = log1p(age)
  ) %>% 
  mutate(sw_firm = ifelse(group == "SW", 1, 0),
         post_alice = ifelse(normalized_period > 0, 1, 0),
         sw_firm_X_post_alice = sw_firm*post_alice)

fit1a <- felm(log_tobins_q ~ sw_firm_X_post_alice + 
               log_patent_stock + log_xrd + log_emp + 
               log_atq + log_chq + log_market_share + 
               log_age | gvkey + period_yyyyq | 0 | gvkey, df)

fit2a <- felm(log_saleq ~ sw_firm_X_post_alice + 
               log_patent_stock + log_xrd + log_emp + 
               log_atq + log_chq + log_market_share + 
               log_age | gvkey + period_yyyyq | 0 | gvkey, df)

fit3a <- felm(log_github_n_commits ~ sw_firm_X_post_alice + 
               log_patent_stock + log_xrd + log_emp + 
               log_atq + log_chq + log_market_share + 
               log_age | gvkey + period_yyyyq | 0 | gvkey, df)

fit4a <- felm(log_avg_n_claims_per_patent ~ sw_firm_X_post_alice + 
               log_patent_stock + log_xrd + log_emp + 
               log_atq + log_chq + log_market_share + 
               log_age | gvkey + period_yyyyq | 0 | gvkey, df)

tab_model(fit1a, fit2a, fit3a, fit4a, 
          dv.labels = c("Tobin's Q", "Sales", "GitHub Commits", "Patent Scope"),
          show.se = TRUE, show.ci = FALSE, p.threshold = c(0.1, 0.05, 0.01),
          collapse.se = TRUE, p.style = "asterisk", digits = 3)

```


### Table S4: Robustness

```{r catch=T}
df2 = df %>% 
  mutate(
    avg_n_claims_per_patent = ifelse(n_patents==0, 0, avg_n_claims_per_patent),
    log_avg_n_claims_per_patent = ifelse(n_patents==0, 0, log_avg_n_claims_per_patent)) %>% 
  drop_na


fit1b <- felm(log_tobins_q ~ sw_firm_X_post_alice + 
               log_patent_stock + log_xrd + log_emp + 
               log_atq + log_chq + log_market_share + 
               log_age | gvkey + period_yyyyq | 0 | gvkey, df2)

fit2b <- felm(log_saleq ~ sw_firm_X_post_alice + 
               log_patent_stock + log_xrd + log_emp + 
               log_atq + log_chq + log_market_share + 
               log_age | gvkey + period_yyyyq | 0 | gvkey, df2)

fit3b <- felm(log_github_n_commits ~ sw_firm_X_post_alice + 
               log_patent_stock + log_xrd + log_emp + 
               log_atq + log_chq + log_market_share + 
               log_age | gvkey + period_yyyyq | 0 | gvkey, df2)

fit4b <- felm(log_avg_n_claims_per_patent ~ sw_firm_X_post_alice + 
               log_patent_stock + log_xrd + log_emp + 
               log_atq + log_chq + log_market_share + 
               log_age | gvkey + period_yyyyq | 0 | gvkey, df2)

tab_model(fit1b, fit2b, fit3b, fit4b, 
          dv.labels = c("Tobin's Q", "Sales", "GitHub Commits", "Patent Scope"),
          show.se = TRUE, show.ci = FALSE, p.threshold = c(0.1, 0.05, 0.01),
          collapse.se = TRUE, p.style = "asterisk", digits = 3)

```

